## Importing the libraries used for the upcoming analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import seaborn as sns
#accessing the waste-price dataset
# NOTE(review): hard-coded absolute Windows path — works only on this machine;
# consider a relative path or a config variable.
df = pd.read_excel("C:\\Users\\aiman\\Downloads\\df.xlsx")
#printing first 5 rows to get a first look at the columns
df.head()
| week | wPreis | Plz | full | date | |
|---|---|---|---|---|---|
| 0 | 2020-37 | -37.519318 | ['25', '24'] | A2 - geschreddert | 2020-09-14 |
| 1 | 2020-39 | -40.000000 | ['25', '24'] | A2 - geschreddert | 2020-09-28 |
| 2 | 2020-41 | -34.853543 | ['25', '24'] | A2 - geschreddert | 2020-10-12 |
| 3 | 2020-42 | -27.560202 | ['25', '24'] | A2 - geschreddert | 2020-10-19 |
| 4 | 2020-43 | -31.937780 | ['25', '24'] | A2 - geschreddert | 2020-10-26 |
We observe that the data starts from 14 September 2020.
#printing last 5 rows to check the end of the date range
df.tail()
| week | wPreis | Plz | full | date | |
|---|---|---|---|---|---|
| 3374 | 2023-32 | 49.330752 | ['80', '81', '82', '83', '84', '85', '93', '94'] | A2 & A3 - geschreddert | 2023-08-07 |
| 3375 | 2023-33 | 50.000000 | ['80', '81', '82', '83', '84', '85', '93', '94'] | A2 & A3 - geschreddert | 2023-08-14 |
| 3376 | 2023-34 | 49.144626 | ['80', '81', '82', '83', '84', '85', '93', '94'] | A2 & A3 - geschreddert | 2023-08-21 |
| 3377 | 2023-35 | 44.990338 | ['80', '81', '82', '83', '84', '85', '93', '94'] | A2 & A3 - geschreddert | 2023-08-28 |
| 3378 | 2023-36 | 45.375906 | ['80', '81', '82', '83', '84', '85', '93', '94'] | A2 & A3 - geschreddert | 2023-09-04 |
We observe that the data runs until 4 September 2023.
#checking the count, null values and data type of the columns
#(the dtypes matter for the datetime/float conversions done later)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3379 entries, 0 to 3378 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 week 3379 non-null object 1 wPreis 3379 non-null float64 2 Plz 3379 non-null object 3 full 3379 non-null object 4 date 3379 non-null datetime64[ns] dtypes: datetime64[ns](1), float64(1), object(3) memory usage: 132.1+ KB
We can observe above that there are no null values. There are 5 columns in total and 3379 entries. 'full', 'week' and 'Plz' (area clusters) are of object dtype.
#checking how many unique values we have for clusters
#(each value is a string representation of a list of postcode prefixes)
df['Plz'].unique()
array(["['25', '24']", "['26', '27', '28']", "['20', '21', '22', '23']",
"['29', '30', '31', '38', '39']",
"['10', '11', '12', '13', '14', '15', '16']", "['48', '49']",
"['1', '4', '6', '7', '8', '9']",
"['40', '41', '42', '44', '45', '46', '47']",
"['50', '51', '52', '53']",
"['80', '81', '82', '83', '84', '85', '93', '94']"], dtype=object)
we observe that there are 10 unique clusters
#checking how many unique values we have for 'full' that is the waste category
df['full'].unique()
array(['A2 - geschreddert', 'A1 & A2 - geschreddert',
'A2 & A3 - geschreddert', 'A3 - geschreddert'], dtype=object)
we observe that there are 4 unique categories
So we want to find and observe if the price fluctuations of electricity in Germany is affecting the price of waste products. Therefore, we will first load the electricity dataset. Thereafter, we will explore based on different clusters and different waste categories
The electricity dataset has been fetched from https://www.netztransparenz.de/EEG/Marktpraemie/Spotmarktpreis
#loading the electricity dataset (semicolon-separated CSV; the price column
#uses a German decimal comma and is cleaned to float further below)
df_Electricity = pd.read_csv("Electricity_preis.csv",sep=';')
#printing first 5 rows
df_Electricity.head()
| Datum | von | Zeitzone von | bis | Zeitzone bis | Spotmarktpreis in ct/kWh | |
|---|---|---|---|---|---|---|
| 0 | 01.01.2021 | 00:00 | CET | 01:00 | CET | 5,087 |
| 1 | 01.01.2021 | 01:00 | CET | 02:00 | CET | 4,819 |
| 2 | 01.01.2021 | 02:00 | CET | 03:00 | CET | 4,468 |
| 3 | 01.01.2021 | 03:00 | CET | 04:00 | CET | 4,292 |
| 4 | 01.01.2021 | 04:00 | CET | 05:00 | CET | 4,039 |
we observe that the data available starts from January 1, 2021. However the data given in the waste price dataset starts from 14 September, 2020. Therefore, later we will need to trim our waste dataset to analyze.
NOTE: The data is given in hourly basis. That means, for each day we have 24 rows. Whereas, the waste price dataset contains information on weekly basis.
#printing last 5 rows to check the end of the electricity date range
df_Electricity.tail()
| Datum | von | Zeitzone von | bis | Zeitzone bis | Spotmarktpreis in ct/kWh | |
|---|---|---|---|---|---|---|
| 23346 | 31.08.2023 | 19:00 | CEST | 20:00 | CEST | 14,855 |
| 23347 | 31.08.2023 | 20:00 | CEST | 21:00 | CEST | 13,915 |
| 23348 | 31.08.2023 | 21:00 | CEST | 22:00 | CEST | 12,55 |
| 23349 | 31.08.2023 | 22:00 | CEST | 23:00 | CEST | 10,603 |
| 23350 | 31.08.2023 | 23:00 | CEST | 00:00 | CEST | 9,689 |
Here we observe that the electricity data runs until 31 August 2023, whereas the waste data runs until 4 September 2023.
#Lets now see some basic information about this dataset
#(all 6 columns are read in as 'object'; conversions follow below)
df_Electricity.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 23351 entries, 0 to 23350 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Datum 23351 non-null object 1 von 23351 non-null object 2 Zeitzone von 23351 non-null object 3 bis 23351 non-null object 4 Zeitzone bis 23351 non-null object 5 Spotmarktpreis in ct/kWh 23351 non-null object dtypes: object(6) memory usage: 1.1+ MB
we observe that it has 6 columns in total and 23351 entries. It does not have any null values. Also all the columns are of 'object' data type.
Now we can very clearly observe that a few changes need to be made in our electricity dataset so that we can efficiently and correctly do the analysis.
# Convert the 'Datum' column to datetime; the explicit format matches the
# day-first German 'dd.mm.yyyy' strings in the CSV.
df_Electricity['Datum'] = pd.to_datetime(df_Electricity['Datum'], format='%d.%m.%Y')
df_Electricity.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 23351 entries, 0 to 23350 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Datum 23351 non-null datetime64[ns] 1 von 23351 non-null object 2 Zeitzone von 23351 non-null object 3 bis 23351 non-null object 4 Zeitzone bis 23351 non-null object 5 Spotmarktpreis in ct/kWh 23351 non-null object dtypes: datetime64[ns](1), object(5) memory usage: 1.1+ MB
We see above that our 'Datum' column is now of 'datetime' type.
# Clean 'Spotmarktpreis in ct/kWh' by replacing the German decimal comma
# with a dot and converting the column to float
df_Electricity['Spotmarktpreis in ct/kWh'] = df_Electricity['Spotmarktpreis in ct/kWh'].str.replace(',', '.').astype(float)
df_Electricity.head(3)
| Datum | von | Zeitzone von | bis | Zeitzone bis | Spotmarktpreis in ct/kWh | |
|---|---|---|---|---|---|---|
| 0 | 2021-01-01 | 00:00 | CET | 01:00 | CET | 5.087 |
| 1 | 2021-01-01 | 01:00 | CET | 02:00 | CET | 4.819 |
| 2 | 2021-01-01 | 02:00 | CET | 03:00 | CET | 4.468 |
We can see that our Spotmarktpreis is now presented with 'dot' instead of 'comma'
Like mentioned before, the data is given on hourly basis, so we need to fix that by getting average Spotmarktpreis for each day, and then for each week.
# Group by 'Datum' (one group per calendar day) and average the 24 hourly
# 'Spotmarktpreis in ct/kWh' values into a daily mean
average_by_date = df_Electricity.groupby('Datum')['Spotmarktpreis in ct/kWh'].mean().reset_index()
#printing first 5 rows of new dataset
average_by_date.head(5)
| Datum | Spotmarktpreis in ct/kWh | |
|---|---|---|
| 0 | 2021-01-01 | 4.839833 |
| 1 | 2021-01-02 | 5.056250 |
| 2 | 2021-01-03 | 3.862250 |
| 3 | 2021-01-04 | 4.801792 |
| 4 | 2021-01-05 | 5.533708 |
we see above, now we have data in average for each day rather than hourly basis
#information about new dataset (one row per day now)
average_by_date.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 973 entries, 0 to 972 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Datum 973 non-null datetime64[ns] 1 Spotmarktpreis in ct/kWh 973 non-null float64 dtypes: datetime64[ns](1), float64(1) memory usage: 15.3 KB
After calculating the average value of price for each day, we have now 973 rows
# Extract the ISO week number from the 'Datum' column and store it in a new column 'week'
average_by_date['week'] = average_by_date['Datum'].dt.isocalendar().week
#extracting the year component from the 'Datum' column and storing it in a new column 'Year'
# NOTE: dt.year is the *calendar* year while isocalendar().week is the *ISO*
# week — for 01–03 Jan 2021 this yields (Year=2021, week=53), which is why
# those rows are dropped below.
average_by_date['Year'] = average_by_date['Datum'].dt.year
average_by_date.head()
| Datum | Spotmarktpreis in ct/kWh | week | Year | |
|---|---|---|---|---|
| 0 | 2021-01-01 | 4.839833 | 53 | 2021 |
| 1 | 2021-01-02 | 5.056250 | 53 | 2021 |
| 2 | 2021-01-03 | 3.862250 | 53 | 2021 |
| 3 | 2021-01-04 | 4.801792 | 1 | 2021 |
| 4 | 2021-01-05 | 5.533708 | 1 | 2021 |
we have now added a new column with year mentioned for each row.
NOTE: Down below I will be trimming our waste dataset by removing some row so that my data starts from first week of 2021. Because as mentioned earlier, for electricity data we have information starting from 2021 January.
That is why here I am removing the first 3 rows so that my data for electricity as well starts from first week of 2021, which starts from 04.01.2021.
It can be observed that 01.01.2021 , 02.01.2021 , 03.01.2021 belongs to week 53 of year 2020.
# Drop the days belonging to ISO week 53 of 2020 (01–03 Jan 2021) so the
# series starts at ISO week 1 of 2021.
# Filtering on the ISO year is more robust than the original
# drop([0, 1, 2]), which relied on hard-coded positional index labels;
# the selected rows (and the resulting index 3..972) are identical.
iso_year = average_by_date['Datum'].dt.isocalendar().year
average_by_date = average_by_date[iso_year >= 2021]
average_by_date.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 970 entries, 3 to 972 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Datum 970 non-null datetime64[ns] 1 Spotmarktpreis in ct/kWh 970 non-null float64 2 week 970 non-null UInt32 3 Year 970 non-null int64 dtypes: UInt32(1), datetime64[ns](1), float64(1), int64(1) memory usage: 27.6 KB
We see now we have 970 rows after dropping 3 rows
Since our waste dataset has prices for weeks, we extract average weekly value for electricity dataset and also add a column mentioning the week
# Calculate the average weekly value
# (grouping on both Year and week keeps the same week number of different years apart)
weekly_average = average_by_date.groupby(['Year', 'week'])['Spotmarktpreis in ct/kWh'].mean().reset_index()
# Rename the column to reflect that it now holds a weekly average
weekly_average.rename(columns={'Spotmarktpreis in ct/kWh': 'average_Spotmarktpreis in ct/kWh'}, inplace=True)
weekly_average.head(5)
| Year | week | average_Spotmarktpreis in ct/kWh | |
|---|---|---|---|
| 0 | 2021 | 1 | 5.954065 |
| 1 | 2021 | 2 | 5.576946 |
| 2 | 2021 | 3 | 4.438315 |
| 3 | 2021 | 4 | 5.455792 |
| 4 | 2021 | 5 | 4.511494 |
#checking row count after weekly aggregation (should match the waste data later)
weekly_average.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 140 entries, 0 to 139 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Year 140 non-null int64 1 week 140 non-null UInt32 2 average_Spotmarktpreis in ct/kWh 140 non-null float64 dtypes: UInt32(1), float64(1), int64(1) memory usage: 3.0 KB
So after calculating average weekly price we have total 140 rows. It will be observed below that our waste price dataset will also have 140 entries for each waste category under each cluster, so that we are able to merge the two datasets for exploration.
#I was not happy with the name so renamed to a more appropriate one
# BUG FIX: the original called new_df.rename_axis('Electricity_df', inplace=True),
# which only renames the DataFrame's *index axis* — it never creates a variable
# named Electricity_df, so the next line raised a NameError.  Binding a copy of
# the frame to the new name is the correct way to "rename" a DataFrame.
Electricity_df = weekly_average.copy()  # keep weekly_average untouched
Electricity_df.head()
| Year | week | average_Spotmarktpreis in ct/kWh | |
|---|---|---|---|
| 0 | 2021 | 1 | 5.954065 |
| 1 | 2021 | 2 | 5.576946 |
| 2 | 2021 | 3 | 4.438315 |
| 3 | 2021 | 4 | 5.455792 |
| 4 | 2021 | 5 | 4.511494 |
#sanity check: still 140 weekly rows after the rename
Electricity_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 140 entries, 0 to 139 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Year 140 non-null int64 1 week 140 non-null UInt32 2 average_Spotmarktpreis in ct/kWh 140 non-null float64 dtypes: UInt32(1), float64(1), int64(1) memory usage: 3.0 KB
In order to merge the datasets easily by a common column, I have added index column to the electricity dataset and will be doing same to waste price dataset.
After merging I will drop all the unnecessary columns.
#Adding indices so that merging is easier
# reset_index() already names the promoted column 'index'; the original
# chained .rename(columns={'index': 'index'}) was a no-op and was removed.
Electricity_df = Electricity_df.reset_index()
Electricity_df.head()
| index | Year | week | average_Spotmarktpreis in ct/kWh | |
|---|---|---|---|---|
| 0 | 0 | 2021 | 1 | 5.954065 |
| 1 | 1 | 2021 | 2 | 5.576946 |
| 2 | 2 | 2021 | 3 | 4.438315 |
| 3 | 3 | 2021 | 4 | 5.455792 |
| 4 | 4 | 2021 | 5 | 4.511494 |
Since our electricity dataset has data from January 1, 2021 but our waste price dataset starts from September 14, 2020. Lets now extract data from our waste price dataset from January 1, 2021 so that we can find correlations.
# Make sure the 'date' column carries a proper datetime dtype
df['date'] = pd.to_datetime(df['date'])
# Analysis window: keep only dates the electricity data can cover
start_date = '2021-01-01'  # Start date in yyyy-mm-dd format
end_date = '2023-12-31'  # End date in yyyy-mm-dd format
# Series.between is inclusive on both ends, equivalent to >= start & <= end
New_df = df[df['date'].between(start_date, end_date)]
New_df.head()
| week | wPreis | Plz | full | date | |
|---|---|---|---|---|---|
| 14 | 2020-53 | -29.031922 | ['25', '24'] | A2 - geschreddert | 2021-01-04 |
| 15 | 2021-01 | -35.436666 | ['25', '24'] | A2 - geschreddert | 2021-01-04 |
| 16 | 2021-02 | -34.298460 | ['25', '24'] | A2 - geschreddert | 2021-01-11 |
| 17 | 2021-03 | -30.143425 | ['25', '24'] | A2 - geschreddert | 2021-01-18 |
| 18 | 2021-04 | -31.419623 | ['25', '24'] | A2 - geschreddert | 2021-01-25 |
Here we see that our data starts from January 4, 2021. And in the 'week' column we have 53rd week (2020-53) in our first row. This probably should be because January 1, 2021 is included in the week 53 along with 28,29,30,31 December 2020. The week 1 of the year 2021 starts from January 4.
This can be likely due to how weeks are defined in the "ISO week date" system. In the ISO week date system, a year does not always start on January 1st, and weeks are defined in a way that can result in the last days of December being counted as part of the first week of the following year.
NOTE: As i mentioned earlier, I am dropping the first row so that the data starts from week 1 of year 2021.
#dropping first row (index label 14, week 2020-53)
# NOTE(review): this assumes label 14 exists and is the only remaining
# 2020-week row — fragile; filtering on New_df['week'].str.startswith('2021')
# would be more robust.  TODO confirm no other cluster/category still carries
# a 2020-53 row.
New_df = New_df.drop(14)
New_df.head()
| week | wPreis | Plz | full | date | |
|---|---|---|---|---|---|
| 15 | 2021-01 | -35.436666 | ['25', '24'] | A2 - geschreddert | 2021-01-04 |
| 16 | 2021-02 | -34.298460 | ['25', '24'] | A2 - geschreddert | 2021-01-11 |
| 17 | 2021-03 | -30.143425 | ['25', '24'] | A2 - geschreddert | 2021-01-18 |
| 18 | 2021-04 | -31.419623 | ['25', '24'] | A2 - geschreddert | 2021-01-25 |
| 19 | 2021-05 | -25.460273 | ['25', '24'] | A2 - geschreddert | 2021-02-01 |
I have removed the first row because it had data for year 2020 which will conflict with our Electricity data and obstruct in analyzing.
#Last 5 rows of the new dataset, to confirm the end of the date range
New_df.tail()
| week | wPreis | Plz | full | date | |
|---|---|---|---|---|---|
| 3374 | 2023-32 | 49.330752 | ['80', '81', '82', '83', '84', '85', '93', '94'] | A2 & A3 - geschreddert | 2023-08-07 |
| 3375 | 2023-33 | 50.000000 | ['80', '81', '82', '83', '84', '85', '93', '94'] | A2 & A3 - geschreddert | 2023-08-14 |
| 3376 | 2023-34 | 49.144626 | ['80', '81', '82', '83', '84', '85', '93', '94'] | A2 & A3 - geschreddert | 2023-08-21 |
| 3377 | 2023-35 | 44.990338 | ['80', '81', '82', '83', '84', '85', '93', '94'] | A2 & A3 - geschreddert | 2023-08-28 |
| 3378 | 2023-36 | 45.375906 | ['80', '81', '82', '83', '84', '85', '93', '94'] | A2 & A3 - geschreddert | 2023-09-04 |
we see above the last 5 rows of our new Dataframe which has the last date as September 4, 2023 and in week 36 of year 2023
Since the data is divided into clusters, we work with one cluster each time, to analyze how the price is being affected in each cluster.
So, lets first extract the data for Cluster ['29', '30', '31', '38', '39'] and then explore for each waste product category individually.
#Extracting a cluster data from dataset
# Keep the cluster key in a named variable so the comparison stays readable
target_cluster = "['29', '30', '31', '38', '39']"
Cluster_1 = New_df[New_df['Plz'] == target_cluster]
Cluster_1.head()
| week | wPreis | Plz | full | date | |
|---|---|---|---|---|---|
| 1258 | 2021-01 | -28.208196 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-04 |
| 1259 | 2021-02 | -27.511800 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-11 |
| 1260 | 2021-03 | -20.069035 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-18 |
| 1261 | 2021-04 | -24.969895 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-25 |
| 1262 | 2021-05 | -22.460770 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-02-01 |
Above we can see the first 5 rows for our cluster ['29', '30', '31', '38', '39'] from week 01 of year 2021.
#row count and dtypes for this cluster (3 categories x 140 weeks = 420 rows)
Cluster_1.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 420 entries, 1258 to 1709 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 week 420 non-null object 1 wPreis 420 non-null float64 2 Plz 420 non-null object 3 full 420 non-null object 4 date 420 non-null datetime64[ns] dtypes: datetime64[ns](1), float64(1), object(3) memory usage: 19.7+ KB
We see above that we have 420 rows in total, containing data for this cluster across 3 product types.
Now lets check how many different types of waste product categories are found in these regions.
#listing the waste categories present in this cluster
Cluster_1['full'].unique()
array(['A2 - geschreddert', 'A1 & A2 - geschreddert',
'A2 & A3 - geschreddert'], dtype=object)
As we can observe we have 3 types of product categories - 'A2 - geschreddert', 'A1 & A2 - geschreddert', 'A2 & A3 - geschreddert'.
Lets see the relation of prices of the 3 categories over the time period of January 2021 to September 2023
#relation of prices of the 3 categories over the time period of January 2021 to September 2023
fig = px.line(Cluster_1, x='date', y='wPreis', color='full', title='Weekly Prices for [29, 30, 31, 38, 39] by Category')
fig.show()
We can see that all three categories for this cluster are following almost the same pattern. Between the time period January 2021 to January 2022 we can observe that the price was quite low, maybe because COVID was still an issue then. But we see it starts to show good rise and reaches a peak of near about 80 for A2 and A2 & A3 around December 2022, whereas A1 & A2 showed highest reach of 97 around the same time. So we can say that all 3 categories gained value in one year.
However, it can be observed that since the beginning of 2023 there has been a downfall of approximately 60% for all the categories.
Let us now explore if electricity prices shows some relation with the fluctuation of the waste price over the same time period
Since we dont have data for electricity price region wise, we compare with the country data.
Also, since our waste dataset for Cluster 1 has 420 entries in total (including all 3 categories) and our electricity dataset has 140 entries, we cannot merge these two datasets right now. Therefore we have to breakdown the cluster 1 dataset into 3 datasets, one for each category, so that each dataset has 140 entries. And then , we merge each of the new dataset (of each product category) with the electricity dataset and do exploration. That means we will have 3 merged datasets in total.
#Select the 'A2 - geschreddert' rows; Cluster_1 already holds only the
#['29', '30', '31', '38', '39'] Plz cluster, so the category test suffices.
Cluster_1_A2 = Cluster_1[Cluster_1['full'] == 'A2 - geschreddert']
Cluster_1_A2.head()
| week | wPreis | Plz | full | date | |
|---|---|---|---|---|---|
| 1258 | 2021-01 | -28.208196 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-04 |
| 1259 | 2021-02 | -27.511800 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-11 |
| 1260 | 2021-03 | -20.069035 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-18 |
| 1261 | 2021-04 | -24.969895 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-25 |
| 1262 | 2021-05 | -22.460770 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-02-01 |
Above we see the first 5 rows of data for the cluster - ['29', '30', '31', '38', '39'] of waste product type - A2 - geschreddert only
#expecting 140 rows: one per week from week 1 of 2021
Cluster_1_A2.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 140 entries, 1258 to 1397 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 week 140 non-null object 1 wPreis 140 non-null float64 2 Plz 140 non-null object 3 full 140 non-null object 4 date 140 non-null datetime64[ns] dtypes: datetime64[ns](1), float64(1), object(3) memory usage: 6.6+ KB
Above we see we have 140 rows of data starting from week 1, 2021.
We remember that our electricity dataset also had 140 rows of data starting from week 1 of 2021.
#adding index to the Cluster 1 , A2 category dataset , so that merging is easier
# reset_index() already names the promoted column 'index'; the original
# chained .rename(columns={'index': 'index'}) was a no-op and was removed.
Cluster_1_A2 = Cluster_1_A2.reset_index()
Cluster_1_A2.head()
| index | week | wPreis | Plz | full | date | |
|---|---|---|---|---|---|---|
| 0 | 1258 | 2021-01 | -28.208196 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-04 |
| 1 | 1259 | 2021-02 | -27.511800 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-11 |
| 2 | 1260 | 2021-03 | -20.069035 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-18 |
| 3 | 1261 | 2021-04 | -24.969895 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-25 |
| 4 | 1262 | 2021-05 | -22.460770 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-02-01 |
However I observed that the index is by default starting from 1258, which is not good for me, so I will drop this column and add a new index column that would start from '0'
#Discard the old row labels that reset_index() promoted into the 'index' column
Cluster_1_A2 = Cluster_1_A2.drop('index', axis=1)
Cluster_1_A2.head()
| week | wPreis | Plz | full | date | |
|---|---|---|---|---|---|
| 0 | 2021-01 | -28.208196 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-04 |
| 1 | 2021-02 | -27.511800 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-11 |
| 2 | 2021-03 | -20.069035 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-18 |
| 3 | 2021-04 | -24.969895 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-25 |
| 4 | 2021-05 | -22.460770 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-02-01 |
Now we see the index column has been dropped
#Now we add a new index column
# reset_index() names the new column 'index' by itself; the original
# .rename(columns={'index': 'index'}) was a no-op and was removed.
Cluster_1_A2 = Cluster_1_A2.reset_index()
#Index starting from '0' added
Cluster_1_A2.head()
| index | week | wPreis | Plz | full | date | |
|---|---|---|---|---|---|---|
| 0 | 0 | 2021-01 | -28.208196 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-04 |
| 1 | 1 | 2021-02 | -27.511800 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-11 |
| 2 | 2 | 2021-03 | -20.069035 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-18 |
| 3 | 3 | 2021-04 | -24.969895 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-25 |
| 4 | 4 | 2021-05 | -22.460770 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-02-01 |
#Select the 'A1 & A2 - geschreddert' rows; Cluster_1 already holds only the
#['29', '30', '31', '38', '39'] Plz cluster, so the category test suffices.
Cluster1_A1_A2 = Cluster_1[Cluster_1['full'] == 'A1 & A2 - geschreddert']
Cluster1_A1_A2.head()
| week | wPreis | Plz | full | date | |
|---|---|---|---|---|---|
| 1414 | 2021-01 | -18.135079 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-01-04 |
| 1415 | 2021-02 | -25.667568 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-01-11 |
| 1416 | 2021-03 | -20.069035 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-01-18 |
| 1417 | 2021-04 | -23.926019 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-01-25 |
| 1418 | 2021-05 | -20.592872 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-02-01 |
Above we see the first 5 rows for the cluster -['29', '30', '31', '38', '39'] product category - 'A1 & A2 - geschreddert'
#expecting 140 rows: one per week from week 1 of 2021
Cluster1_A1_A2.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 140 entries, 1414 to 1553 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 week 140 non-null object 1 wPreis 140 non-null float64 2 Plz 140 non-null object 3 full 140 non-null object 4 date 140 non-null datetime64[ns] dtypes: datetime64[ns](1), float64(1), object(3) memory usage: 6.6+ KB
Above we see we have 140 rows of data starting from week 1, 2021.
#adding index to the Cluster 1 , A1 & A2 category dataset , so that merging is easier
# reset_index() already names the promoted column 'index'; the original
# chained .rename(columns={'index': 'index'}) was a no-op and was removed.
Cluster1_A1_A2 = Cluster1_A1_A2.reset_index()
Cluster1_A1_A2.head()
| index | week | wPreis | Plz | full | date | |
|---|---|---|---|---|---|---|
| 0 | 1414 | 2021-01 | -18.135079 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-01-04 |
| 1 | 1415 | 2021-02 | -25.667568 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-01-11 |
| 2 | 1416 | 2021-03 | -20.069035 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-01-18 |
| 3 | 1417 | 2021-04 | -23.926019 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-01-25 |
| 4 | 1418 | 2021-05 | -20.592872 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-02-01 |
However I observed that the index is by default starting from 1414, which is not good for me, so I will drop this column and add a new index column that would start from '0'
#Discard the old row labels that reset_index() promoted into the 'index' column
Cluster1_A1_A2 = Cluster1_A1_A2.drop('index', axis=1)
Cluster1_A1_A2.head()
| week | wPreis | Plz | full | date | |
|---|---|---|---|---|---|
| 0 | 2021-01 | -18.135079 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-01-04 |
| 1 | 2021-02 | -25.667568 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-01-11 |
| 2 | 2021-03 | -20.069035 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-01-18 |
| 3 | 2021-04 | -23.926019 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-01-25 |
| 4 | 2021-05 | -20.592872 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-02-01 |
Now we see the index column has been dropped
#Now we add a new index column
# reset_index() names the new column 'index' by itself; no rename needed.
Cluster1_A1_A2 = Cluster1_A1_A2.reset_index()
#Index starting from '0' added
# BUG FIX: the original displayed Cluster_1_A2.head() here — the A2-only
# frame — instead of the A1 & A2 frame that was just re-indexed.
Cluster1_A1_A2.head()
| index | week | wPreis | Plz | full | date | |
|---|---|---|---|---|---|---|
| 0 | 0 | 2021-01 | -28.208196 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-04 |
| 1 | 1 | 2021-02 | -27.511800 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-11 |
| 2 | 2 | 2021-03 | -20.069035 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-18 |
| 3 | 3 | 2021-04 | -24.969895 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-25 |
| 4 | 4 | 2021-05 | -22.460770 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-02-01 |
#Select the 'A2 & A3 - geschreddert' rows; Cluster_1 already holds only the
#['29', '30', '31', '38', '39'] Plz cluster, so the category test suffices.
Cluster1_A2_A3 = Cluster_1[Cluster_1['full'] == 'A2 & A3 - geschreddert']
Cluster1_A2_A3.head()
| week | wPreis | Plz | full | date | |
|---|---|---|---|---|---|
| 1570 | 2021-01 | -27.769906 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-04 |
| 1571 | 2021-02 | -26.538161 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-11 |
| 1572 | 2021-03 | -19.870230 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-18 |
| 1573 | 2021-04 | -24.063127 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-25 |
| 1574 | 2021-05 | -22.577891 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-02-01 |
Above we see the first 5 rows of data for the cluster - ['29', '30', '31', '38', '39'] of waste product type - A2 & A3 - geschreddert only
#expecting 140 rows: one per week from week 1 of 2021
Cluster1_A2_A3.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 140 entries, 1570 to 1709 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 week 140 non-null object 1 wPreis 140 non-null float64 2 Plz 140 non-null object 3 full 140 non-null object 4 date 140 non-null datetime64[ns] dtypes: datetime64[ns](1), float64(1), object(3) memory usage: 6.6+ KB
Above we see we have 140 rows of data starting from week 1, 2021.
We remember that our electricity dataset also had 140 rows of data starting from week 1 of 2021.
#adding index to the Cluster 1 , A2 & A3 category dataset , so that merging is easier
# reset_index() already names the promoted column 'index'; the original
# chained .rename(columns={'index': 'index'}) was a no-op and was removed.
Cluster1_A2_A3 = Cluster1_A2_A3.reset_index()
Cluster1_A2_A3.head()
| index | week | wPreis | Plz | full | date | |
|---|---|---|---|---|---|---|
| 0 | 1570 | 2021-01 | -27.769906 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-04 |
| 1 | 1571 | 2021-02 | -26.538161 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-11 |
| 2 | 1572 | 2021-03 | -19.870230 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-18 |
| 3 | 1573 | 2021-04 | -24.063127 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-25 |
| 4 | 1574 | 2021-05 | -22.577891 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-02-01 |
However I observed that the index is by default starting from 1570, which is not good for me, so I will drop this column and add a new index column that would start from '0'
#Discard the old row labels that reset_index() promoted into the 'index' column
Cluster1_A2_A3 = Cluster1_A2_A3.drop('index', axis=1)
Cluster1_A2_A3.head()
| week | wPreis | Plz | full | date | |
|---|---|---|---|---|---|
| 0 | 2021-01 | -27.769906 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-04 |
| 1 | 2021-02 | -26.538161 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-11 |
| 2 | 2021-03 | -19.870230 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-18 |
| 3 | 2021-04 | -24.063127 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-25 |
| 4 | 2021-05 | -22.577891 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-02-01 |
Now we see the index column has been dropped
#Now we add a new index column
# reset_index() names the new column 'index' by itself; the original
# .rename(columns={'index': 'index'}) was a no-op and was removed.
Cluster1_A2_A3 = Cluster1_A2_A3.reset_index()
#Index starting from '0' added
Cluster1_A2_A3.head()
| index | week | wPreis | Plz | full | date | |
|---|---|---|---|---|---|---|
| 0 | 0 | 2021-01 | -27.769906 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-04 |
| 1 | 1 | 2021-02 | -26.538161 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-11 |
| 2 | 2 | 2021-03 | -19.870230 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-18 |
| 3 | 3 | 2021-04 | -24.063127 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-25 |
| 4 | 4 | 2021-05 | -22.577891 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-02-01 |
# Merge waste prices and weekly electricity prices row-by-row on the
# positional index (both frames have 140 rows, week 1/2021 to week 36/2023).
# NOTE(review): this relies on both frames being in identical week order;
# merging on the (year, week) keys would be safer — TODO confirm.
merged_A2_df = pd.merge(Cluster_1_A2, Electricity_df, left_index=True, right_index=True, how='inner')
merged_A2_df.head()
| index_x | week_x | wPreis | Plz | full | date | index_y | Year | week_y | average_Spotmarktpreis in ct/kWh | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 2021-01 | -28.208196 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-04 | 0 | 2021 | 1 | 5.954065 |
| 1 | 1 | 2021-02 | -27.511800 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-11 | 1 | 2021 | 2 | 5.576946 |
| 2 | 2 | 2021-03 | -20.069035 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-18 | 2 | 2021 | 3 | 4.438315 |
| 3 | 3 | 2021-04 | -24.969895 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-25 | 3 | 2021 | 4 | 5.455792 |
| 4 | 4 | 2021-05 | -22.460770 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-02-01 | 4 | 2021 | 5 | 4.511494 |
We see above, the two datasets have been merged now, but it has few columns that is not of our use, so I will drop them.
#Dropping the helper/duplicate columns left over from the merge
merged_A2_df = merged_A2_df.drop(columns=['index_x', 'index_y', 'week_y', 'Year'])
merged_A2_df.head()
| week_x | wPreis | Plz | full | date | average_Spotmarktpreis in ct/kWh | |
|---|---|---|---|---|---|---|
| 0 | 2021-01 | -28.208196 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-04 | 5.954065 |
| 1 | 2021-02 | -27.511800 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-11 | 5.576946 |
| 2 | 2021-03 | -20.069035 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-18 | 4.438315 |
| 3 | 2021-04 | -24.969895 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-25 | 5.455792 |
| 4 | 2021-05 | -22.460770 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-02-01 | 4.511494 |
# Rename the surviving 'week_x' column to its final name, 'Week'.
merged_A2_df = merged_A2_df.rename({'week_x': 'Week'}, axis='columns')
merged_A2_df.head()
| Week | wPreis | Plz | full | date | average_Spotmarktpreis in ct/kWh | |
|---|---|---|---|---|---|---|
| 0 | 2021-01 | -28.208196 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-04 | 5.954065 |
| 1 | 2021-02 | -27.511800 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-11 | 5.576946 |
| 2 | 2021-03 | -20.069035 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-18 | 4.438315 |
| 3 | 2021-04 | -24.969895 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-25 | 5.455792 |
| 4 | 2021-05 | -22.460770 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-02-01 | 4.511494 |
merged_A2_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 140 entries, 0 to 139 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Week 140 non-null object 1 wPreis 140 non-null float64 2 Plz 140 non-null object 3 full 140 non-null object 4 date 140 non-null datetime64[ns] 5 average_Spotmarktpreis in ct/kWh 140 non-null float64 dtypes: datetime64[ns](1), float64(2), object(3) memory usage: 6.7+ KB
NOTE: Since the electricity and waste datasets each had 140 rows, the merged dataset has 140 rows too.
# Same positional join for the A1 & A2 category: waste prices and weekly
# electricity spot prices share the 0..139 RangeIndex.
merged_A1_A2_df = Cluster1_A1_A2.merge(
    Electricity_df, left_index=True, right_index=True, how='inner'
)
merged_A1_A2_df.head()
| index_x | week_x | wPreis | Plz | full | date | index_y | Year | week_y | average_Spotmarktpreis in ct/kWh | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 2021-01 | -18.135079 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-01-04 | 0 | 2021 | 1 | 5.954065 |
| 1 | 1 | 2021-02 | -25.667568 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-01-11 | 1 | 2021 | 2 | 5.576946 |
| 2 | 2 | 2021-03 | -20.069035 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-01-18 | 2 | 2021 | 3 | 4.438315 |
| 3 | 3 | 2021-04 | -23.926019 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-01-25 | 3 | 2021 | 4 | 5.455792 |
| 4 | 4 | 2021-05 | -20.592872 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-02-01 | 4 | 2021 | 5 | 4.511494 |
We see above that the two datasets have been merged, but the result has a few columns that are not of use to us, so I will drop them.
# Drop the merge's leftover helper columns (duplicate index/week + year).
unneeded_cols = ['index_x', 'index_y', 'week_y', 'Year']
merged_A1_A2_df = merged_A1_A2_df.drop(unneeded_cols, axis=1)
merged_A1_A2_df.head()
| week_x | wPreis | Plz | full | date | average_Spotmarktpreis in ct/kWh | |
|---|---|---|---|---|---|---|
| 0 | 2021-01 | -18.135079 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-01-04 | 5.954065 |
| 1 | 2021-02 | -25.667568 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-01-11 | 5.576946 |
| 2 | 2021-03 | -20.069035 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-01-18 | 4.438315 |
| 3 | 2021-04 | -23.926019 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-01-25 | 5.455792 |
| 4 | 2021-05 | -20.592872 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-02-01 | 4.511494 |
# Rename 'week_x' to its final name, 'Week'.
merged_A1_A2_df = merged_A1_A2_df.rename({'week_x': 'Week'}, axis='columns')
merged_A1_A2_df.head()
| Week | wPreis | Plz | full | date | average_Spotmarktpreis in ct/kWh | |
|---|---|---|---|---|---|---|
| 0 | 2021-01 | -18.135079 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-01-04 | 5.954065 |
| 1 | 2021-02 | -25.667568 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-01-11 | 5.576946 |
| 2 | 2021-03 | -20.069035 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-01-18 | 4.438315 |
| 3 | 2021-04 | -23.926019 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-01-25 | 5.455792 |
| 4 | 2021-05 | -20.592872 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-02-01 | 4.511494 |
merged_A1_A2_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 140 entries, 0 to 139 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Week 140 non-null object 1 wPreis 140 non-null float64 2 Plz 140 non-null object 3 full 140 non-null object 4 date 140 non-null datetime64[ns] 5 average_Spotmarktpreis in ct/kWh 140 non-null float64 dtypes: datetime64[ns](1), float64(2), object(3) memory usage: 6.7+ KB
# Same positional join for the A2 & A3 category.
merged_A2_A3_df = Cluster1_A2_A3.merge(
    Electricity_df, left_index=True, right_index=True, how='inner'
)
merged_A2_A3_df.head()
| index_x | week_x | wPreis | Plz | full | date | index_y | Year | week_y | average_Spotmarktpreis in ct/kWh | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 2021-01 | -27.769906 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-04 | 0 | 2021 | 1 | 5.954065 |
| 1 | 1 | 2021-02 | -26.538161 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-11 | 1 | 2021 | 2 | 5.576946 |
| 2 | 2 | 2021-03 | -19.870230 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-18 | 2 | 2021 | 3 | 4.438315 |
| 3 | 3 | 2021-04 | -24.063127 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-25 | 3 | 2021 | 4 | 5.455792 |
| 4 | 4 | 2021-05 | -22.577891 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-02-01 | 4 | 2021 | 5 | 4.511494 |
We see above that the two datasets have been merged, but the result has a few columns that are not of use to us, so I will drop them.
# Drop the merge's leftover helper columns (duplicate index/week + year).
unneeded_cols = ['index_x', 'index_y', 'week_y', 'Year']
merged_A2_A3_df = merged_A2_A3_df.drop(unneeded_cols, axis=1)
merged_A2_A3_df.head()
| week_x | wPreis | Plz | full | date | average_Spotmarktpreis in ct/kWh | |
|---|---|---|---|---|---|---|
| 0 | 2021-01 | -27.769906 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-04 | 5.954065 |
| 1 | 2021-02 | -26.538161 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-11 | 5.576946 |
| 2 | 2021-03 | -19.870230 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-18 | 4.438315 |
| 3 | 2021-04 | -24.063127 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-25 | 5.455792 |
| 4 | 2021-05 | -22.577891 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-02-01 | 4.511494 |
# Rename 'week_x' to its final name, 'Week'.
merged_A2_A3_df = merged_A2_A3_df.rename({'week_x': 'Week'}, axis='columns')
merged_A2_A3_df.head()
| Week | wPreis | Plz | full | date | average_Spotmarktpreis in ct/kWh | |
|---|---|---|---|---|---|---|
| 0 | 2021-01 | -27.769906 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-04 | 5.954065 |
| 1 | 2021-02 | -26.538161 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-11 | 5.576946 |
| 2 | 2021-03 | -19.870230 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-18 | 4.438315 |
| 3 | 2021-04 | -24.063127 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-25 | 5.455792 |
| 4 | 2021-05 | -22.577891 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-02-01 | 4.511494 |
merged_A2_A3_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 140 entries, 0 to 139 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Week 140 non-null object 1 wPreis 140 non-null float64 2 Plz 140 non-null object 3 full 140 non-null object 4 date 140 non-null datetime64[ns] 5 average_Spotmarktpreis in ct/kWh 140 non-null float64 dtypes: datetime64[ns](1), float64(2), object(3) memory usage: 6.7+ KB
# Summary statistics for A2: check the min/max range of both price series.
merged_A2_df.describe()
| wPreis | average_Spotmarktpreis in ct/kWh | |
|---|---|---|
| count | 140.000000 | 140.000000 |
| mean | 20.997726 | 14.887093 |
| std | 33.938080 | 10.356326 |
| min | -30.637729 | 1.491333 |
| 25% | -17.572246 | 7.798090 |
| 50% | 24.186272 | 11.479634 |
| 75% | 47.681561 | 18.775909 |
| max | 82.068525 | 58.591821 |
# Line chart of waste price vs. electricity spot price over time for A2,
# annotated with the Pearson correlation between the two series.
correlation = merged_A2_df['wPreis'].corr(
    merged_A2_df['average_Spotmarktpreis in ct/kWh']
)
fig = px.line(
    merged_A2_df,
    x='date',
    y=['wPreis', 'average_Spotmarktpreis in ct/kWh'],
    labels={'date': 'Date', 'value': 'Price'},
    title='Waste Price vs. Electricity Price Over Time for A2',
)
fig.add_annotation(
    text=f"Correlation coefficient: {correlation:.2f}",
    xref='paper', yref='paper',
    x=0.05, y=0.9,
    showarrow=False,
    font=dict(size=12),
)
fig.show()
We see a correlation of 0.50, which indicates a moderate relationship between the two prices. Correlation alone doesn't prove causation, but it suggests that one price may be influencing the other.
The lower waste prices in Germany from January 2021 to January 2022 may be linked to reduced waste generation during the COVID-19 pandemic. The subsequent increase in May 2022 could be due to economic recovery and changes in recycling demand. The decline from February 2023 might result from fluctuations in global commodity markets and local recycling policies, reflecting the complex interplay of economic, environmental, and regulatory factors.
Also in the describe function above we see the min and max value for both prices -
Max Waste price - 82.068525 on Nov 7, 2022
Min Waste price - -30.637729 on July 5, 2021
Max Electricity price - 58.591821 on August 22, 2022
Min Electricity price - 1.491333 in September 2023
# Heatmap of the 2x2 correlation matrix between the two price columns (A2).
price_cols = ['wPreis', 'average_Spotmarktpreis in ct/kWh']
correlation_matrix = merged_A2_df[price_cols].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Heatmap between Waste Price and Electricity Price')
plt.show()
# Summary statistics for A1 & A2: check the min/max range of both price series.
merged_A1_A2_df.describe()
| wPreis | average_Spotmarktpreis in ct/kWh | |
|---|---|---|
| count | 140.000000 | 140.000000 |
| mean | 25.378814 | 14.887093 |
| std | 34.592329 | 10.356326 |
| min | -27.386157 | 1.491333 |
| 25% | -13.834316 | 7.798090 |
| 50% | 28.107990 | 11.479634 |
| 75% | 50.460903 | 18.775909 |
| max | 97.395457 | 58.591821 |
# Line chart of waste price vs. electricity spot price over time for A1 & A2,
# annotated with the Pearson correlation between the two series.
correlation = merged_A1_A2_df['wPreis'].corr(
    merged_A1_A2_df['average_Spotmarktpreis in ct/kWh']
)
fig = px.line(
    merged_A1_A2_df,
    x='date',
    y=['wPreis', 'average_Spotmarktpreis in ct/kWh'],
    labels={'date': 'Date', 'value': 'Price'},
    title='Waste Price vs. Electricity Price Over Time for A1 & A2',
)
fig.add_annotation(
    text=f"Correlation coefficient: {correlation:.2f}",
    xref='paper', yref='paper',
    x=0.05, y=0.9,
    showarrow=False,
    font=dict(size=12),
)
fig.show()
Correlation = 0.48
We can see above that there is not much of a relationship between the two prices. For this waste product type too, we see that before October 2021 suppliers had to pay to have the waste taken. The price showed a great increase in January 2023. One relationship can be hypothesized but not verified: observing closely, there is a rise in the electricity price on 22nd August 2022 and a peak in the waste price on December 19, 2022. So there might be a lag of a few months in the effect, but we cannot be sure.
Also in the describe function above we see the min and max value for both prices -
Max Waste price - 97.395457 on Dec 19, 2022
Min Waste price - -27.386157 on July 5, 2021
Max Electricity price - 58.591821 on August 22, 2022
Min Electricity price - 1.491333 in September 2023
# Heatmap of the 2x2 correlation matrix between the two price columns (A1 & A2).
price_cols = ['wPreis', 'average_Spotmarktpreis in ct/kWh']
correlation_matrix = merged_A1_A2_df[price_cols].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Heatmap between Waste Price and Electricity Price')
plt.show()
# Summary statistics for A2 & A3: check the min/max range of both price series.
merged_A2_A3_df.describe()
| wPreis | average_Spotmarktpreis in ct/kWh | |
|---|---|---|
| count | 140.000000 | 140.000000 |
| mean | 20.469280 | 14.887093 |
| std | 33.758428 | 10.356326 |
| min | -29.852564 | 1.491333 |
| 25% | -18.096181 | 7.798090 |
| 50% | 23.311289 | 11.479634 |
| 75% | 46.545091 | 18.775909 |
| max | 81.975733 | 58.591821 |
# Line chart of waste price vs. electricity spot price over time for A2 & A3,
# annotated with the Pearson correlation between the two series.
correlation = merged_A2_A3_df['wPreis'].corr(
    merged_A2_A3_df['average_Spotmarktpreis in ct/kWh']
)
fig = px.line(
    merged_A2_A3_df,
    x='date',
    y=['wPreis', 'average_Spotmarktpreis in ct/kWh'],
    labels={'date': 'Date', 'value': 'Price'},
    title='Waste Price vs. Electricity Price Over Time for A2 & A3',
)
fig.add_annotation(
    text=f"Correlation coefficient: {correlation:.2f}",
    xref='paper', yref='paper',
    x=0.05, y=0.9,
    showarrow=False,
    font=dict(size=12),
)
fig.show()
Correlation = 0.50
We see there is a 0.50 correlation between the two prices. All 3 categories show almost the same relationship with electricity. However, we can observe that both prices have started to decrease from February 2023, and for now both are almost at the same level.
Also in the describe function above we see the min and max value for both prices -
Max Waste price - 81.975733 on Oct 3, 2022
Min Waste price - -29.852564 on July 5, 2021
Max Electricity price - 58.591821 on August 22, 2022
Min Electricity price - 1.491333 on September, 2023
# Heatmap of the 2x2 correlation matrix between the two price columns (A2 & A3).
price_cols = ['wPreis', 'average_Spotmarktpreis in ct/kWh']
correlation_matrix = merged_A2_A3_df[price_cols].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Heatmap between Waste Price and Electricity Price')
plt.show()
# Build a one-week-lagged copy of the electricity prices: row t also carries
# the spot price from row t-1, so the first row's lagged value is NaN.
lagged_Electricity_df = Electricity_df.copy()
spot_prices = lagged_Electricity_df['average_Spotmarktpreis in ct/kWh']
lagged_Electricity_df['lagged_price'] = spot_prices.shift(1)
# Display the new DataFrame
print(lagged_Electricity_df)
index Year week average_Spotmarktpreis in ct/kWh lagged_price 0 0 2021 1 5.954065 NaN 1 1 2021 2 5.576946 5.954065 2 2 2021 3 4.438315 5.576946 3 3 2021 4 5.455792 4.438315 4 4 2021 5 4.511494 5.455792 .. ... ... ... ... ... 135 135 2023 32 6.682881 7.298726 136 136 2023 33 10.042887 6.682881 137 137 2023 34 12.177613 10.042887 138 138 2023 35 11.520000 12.177613 139 139 2023 52 1.491333 11.520000 [140 rows x 5 columns]
lagged_Electricity_df.head()
| index | Year | week | average_Spotmarktpreis in ct/kWh | lagged_price | |
|---|---|---|---|---|---|
| 0 | 0 | 2021 | 1 | 5.954065 | NaN |
| 1 | 1 | 2021 | 2 | 5.576946 | 5.954065 |
| 2 | 2 | 2021 | 3 | 4.438315 | 5.576946 |
| 3 | 3 | 2021 | 4 | 5.455792 | 4.438315 |
| 4 | 4 | 2021 | 5 | 4.511494 | 5.455792 |
We see above that our first row now has a NaN value for the lagged price. Now let's merge this dataset with each of our waste product category datasets (A2, A1 & A2, and A2 & A3) separately for our cluster - ['29', '30', '31', '38', '39'].
# First category (A2): positional inner join with the lagged electricity prices.
merged_A2_lagged_df = Cluster_1_A2.merge(
    lagged_Electricity_df, left_index=True, right_index=True, how='inner'
)
merged_A2_lagged_df.head()
| index_x | week_x | wPreis | Plz | full | date | index_y | Year | week_y | average_Spotmarktpreis in ct/kWh | lagged_price | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 2021-01 | -28.208196 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-04 | 0 | 2021 | 1 | 5.954065 | NaN |
| 1 | 1 | 2021-02 | -27.511800 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-11 | 1 | 2021 | 2 | 5.576946 | 5.954065 |
| 2 | 2 | 2021-03 | -20.069035 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-18 | 2 | 2021 | 3 | 4.438315 | 5.576946 |
| 3 | 3 | 2021-04 | -24.969895 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-25 | 3 | 2021 | 4 | 5.455792 | 4.438315 |
| 4 | 4 | 2021-05 | -22.460770 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-02-01 | 4 | 2021 | 5 | 4.511494 | 5.455792 |
# Drop the merge's helper columns and the un-lagged spot price — only the
# lagged price is needed for this comparison.
unneeded_cols = ['index_x', 'index_y', 'week_y', 'Year',
                 'average_Spotmarktpreis in ct/kWh']
merged_A2_lagged_df = merged_A2_lagged_df.drop(unneeded_cols, axis=1)
merged_A2_lagged_df.head()
| week_x | wPreis | Plz | full | date | lagged_price | |
|---|---|---|---|---|---|---|
| 0 | 2021-01 | -28.208196 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-04 | NaN |
| 1 | 2021-02 | -27.511800 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-11 | 5.954065 |
| 2 | 2021-03 | -20.069035 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-18 | 5.576946 |
| 3 | 2021-04 | -24.969895 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-01-25 | 4.438315 |
| 4 | 2021-05 | -22.460770 | ['29', '30', '31', '38', '39'] | A2 - geschreddert | 2021-02-01 | 5.455792 |
# Rename 'week_x' to 'Week' and re-check dtypes / null counts.
merged_A2_lagged_df = merged_A2_lagged_df.rename({'week_x': 'Week'}, axis='columns')
merged_A2_lagged_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 140 entries, 0 to 139 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Week 140 non-null object 1 wPreis 140 non-null float64 2 Plz 140 non-null object 3 full 140 non-null object 4 date 140 non-null datetime64[ns] 5 lagged_price 139 non-null float64 dtypes: datetime64[ns](1), float64(2), object(3) memory usage: 6.7+ KB
We see above that lagged_price has only 139 non-null values, since the first row has no preceding week.
merged_A2_lagged_df.describe()
| wPreis | lagged_price | |
|---|---|---|
| count | 140.000000 | 139.000000 |
| mean | 20.997726 | 14.983466 |
| std | 33.938080 | 10.330586 |
| min | -30.637729 | 3.582036 |
| 25% | -17.572246 | 7.875607 |
| 50% | 24.186272 | 11.520000 |
| 75% | 47.681561 | 18.845557 |
| max | 82.068525 | 58.591821 |
# Heatmap of the correlation between waste price and the lagged electricity
# price for the A2 category (NaN row is excluded pairwise by .corr()).
price_cols = ['wPreis', 'lagged_price']
correlation_matrix = merged_A2_lagged_df[price_cols].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Heatmap between Waste Price and Electricity lagged Price For A2 category')
plt.show()
# Line chart of waste price vs. one-week-lagged electricity price for A2,
# annotated with their Pearson correlation.
correlation = merged_A2_lagged_df['wPreis'].corr(
    merged_A2_lagged_df['lagged_price']
)
fig = px.line(
    merged_A2_lagged_df,
    x='date',
    y=['wPreis', 'lagged_price'],
    labels={'date': 'Date', 'value': 'Price'},
    title='Waste Price vs. Electricity lagged Price Over Time for A2',
)
fig.add_annotation(
    text=f"Correlation coefficient: {correlation:.2f}",
    xref='paper', yref='paper',
    x=0.05, y=0.9,
    showarrow=False,
    font=dict(size=12),
)
fig.show()
It still doesn't show much of a relationship — almost the same as the un-lagged version. In a few places, though, the two series track each other quite closely, such as January–May 2022, December 2022, August 2022 and August 2023.
# Second category (A1 & A2): positional inner join with the lagged prices.
A1_A2_lagged_df = Cluster1_A1_A2.merge(
    lagged_Electricity_df, left_index=True, right_index=True, how='inner'
)
A1_A2_lagged_df.head()
| index_x | week_x | wPreis | Plz | full | date | index_y | Year | week_y | average_Spotmarktpreis in ct/kWh | lagged_price | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 2021-01 | -18.135079 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-01-04 | 0 | 2021 | 1 | 5.954065 | NaN |
| 1 | 1 | 2021-02 | -25.667568 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-01-11 | 1 | 2021 | 2 | 5.576946 | 5.954065 |
| 2 | 2 | 2021-03 | -20.069035 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-01-18 | 2 | 2021 | 3 | 4.438315 | 5.576946 |
| 3 | 3 | 2021-04 | -23.926019 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-01-25 | 3 | 2021 | 4 | 5.455792 | 4.438315 |
| 4 | 4 | 2021-05 | -20.592872 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-02-01 | 4 | 2021 | 5 | 4.511494 | 5.455792 |
# Drop the merge's helper columns and the un-lagged spot price.
unneeded_cols = ['index_x', 'index_y', 'week_y', 'Year',
                 'average_Spotmarktpreis in ct/kWh']
A1_A2_lagged_df = A1_A2_lagged_df.drop(unneeded_cols, axis=1)
A1_A2_lagged_df.head()
| week_x | wPreis | Plz | full | date | lagged_price | |
|---|---|---|---|---|---|---|
| 0 | 2021-01 | -18.135079 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-01-04 | NaN |
| 1 | 2021-02 | -25.667568 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-01-11 | 5.954065 |
| 2 | 2021-03 | -20.069035 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-01-18 | 5.576946 |
| 3 | 2021-04 | -23.926019 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-01-25 | 4.438315 |
| 4 | 2021-05 | -20.592872 | ['29', '30', '31', '38', '39'] | A1 & A2 - geschreddert | 2021-02-01 | 5.455792 |
# Rename 'week_x' to 'Week' and summarize both price columns.
A1_A2_lagged_df = A1_A2_lagged_df.rename({'week_x': 'Week'}, axis='columns')
A1_A2_lagged_df.describe()
| wPreis | lagged_price | |
|---|---|---|
| count | 140.000000 | 139.000000 |
| mean | 25.378814 | 14.983466 |
| std | 34.592329 | 10.330586 |
| min | -27.386157 | 3.582036 |
| 25% | -13.834316 | 7.875607 |
| 50% | 28.107990 | 11.520000 |
| 75% | 50.460903 | 18.845557 |
| max | 97.395457 | 58.591821 |
# Heatmap of the correlation between waste price and the lagged electricity
# price for the A1 & A2 category.
price_cols = ['wPreis', 'lagged_price']
correlation_matrix = A1_A2_lagged_df[price_cols].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Heatmap between Waste Price and Electricity lagged Price For A1 & A2 category')
plt.show()
# Line chart of waste price vs. one-week-lagged electricity price for A1 & A2,
# annotated with their Pearson correlation.
correlation = A1_A2_lagged_df['wPreis'].corr(A1_A2_lagged_df['lagged_price'])
fig = px.line(
    A1_A2_lagged_df,
    x='date',
    y=['wPreis', 'lagged_price'],
    labels={'date': 'Date', 'value': 'Price'},
    title='Waste Price vs. Electricity lagged Price Over Time for A1 & A2',
)
fig.add_annotation(
    text=f"Correlation coefficient: {correlation:.2f}",
    xref='paper', yref='paper',
    x=0.05, y=0.9,
    showarrow=False,
    font=dict(size=12),
)
fig.show()
Correlation is 0.52. The relationship is almost the same as the un-lagged one; as observed before, all product categories share the same pattern. Also, as in the previous graph, a few peaks of the two lines match.
# Third category (A2 & A3): positional inner join with the lagged prices.
A2_A3_lagged_df = Cluster1_A2_A3.merge(
    lagged_Electricity_df, left_index=True, right_index=True, how='inner'
)
A2_A3_lagged_df.head()
| index_x | week_x | wPreis | Plz | full | date | index_y | Year | week_y | average_Spotmarktpreis in ct/kWh | lagged_price | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 2021-01 | -27.769906 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-04 | 0 | 2021 | 1 | 5.954065 | NaN |
| 1 | 1 | 2021-02 | -26.538161 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-11 | 1 | 2021 | 2 | 5.576946 | 5.954065 |
| 2 | 2 | 2021-03 | -19.870230 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-18 | 2 | 2021 | 3 | 4.438315 | 5.576946 |
| 3 | 3 | 2021-04 | -24.063127 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-25 | 3 | 2021 | 4 | 5.455792 | 4.438315 |
| 4 | 4 | 2021-05 | -22.577891 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-02-01 | 4 | 2021 | 5 | 4.511494 | 5.455792 |
# Drop the merge's helper columns and the un-lagged spot price.
unneeded_cols = ['index_x', 'index_y', 'week_y', 'Year',
                 'average_Spotmarktpreis in ct/kWh']
A2_A3_lagged_df = A2_A3_lagged_df.drop(unneeded_cols, axis=1)
A2_A3_lagged_df.head()
| week_x | wPreis | Plz | full | date | lagged_price | |
|---|---|---|---|---|---|---|
| 0 | 2021-01 | -27.769906 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-04 | NaN |
| 1 | 2021-02 | -26.538161 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-11 | 5.954065 |
| 2 | 2021-03 | -19.870230 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-18 | 5.576946 |
| 3 | 2021-04 | -24.063127 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-01-25 | 4.438315 |
| 4 | 2021-05 | -22.577891 | ['29', '30', '31', '38', '39'] | A2 & A3 - geschreddert | 2021-02-01 | 5.455792 |
# Rename 'week_x' to 'Week' and summarize both price columns.
A2_A3_lagged_df = A2_A3_lagged_df.rename({'week_x': 'Week'}, axis='columns')
A2_A3_lagged_df.describe()
| wPreis | lagged_price | |
|---|---|---|
| count | 140.000000 | 139.000000 |
| mean | 20.469280 | 14.983466 |
| std | 33.758428 | 10.330586 |
| min | -29.852564 | 3.582036 |
| 25% | -18.096181 | 7.875607 |
| 50% | 23.311289 | 11.520000 |
| 75% | 46.545091 | 18.845557 |
| max | 81.975733 | 58.591821 |
# Heatmap of the correlation between waste price and the lagged electricity
# price for the A2 & A3 category.
price_cols = ['wPreis', 'lagged_price']
correlation_matrix = A2_A3_lagged_df[price_cols].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Heatmap between Waste Price and Electricity lagged Price For A2 & A3 category')
plt.show()
# Line chart of waste price vs. one-week-lagged electricity price for A2 & A3,
# annotated with their Pearson correlation.
correlation = A2_A3_lagged_df['wPreis'].corr(A2_A3_lagged_df['lagged_price'])
fig = px.line(
    A2_A3_lagged_df,
    x='date',
    y=['wPreis', 'lagged_price'],
    labels={'date': 'Date', 'value': 'Price'},
    title='Waste Price vs. Electricity lagged Price Over Time for A2 & A3',
)
fig.add_annotation(
    text=f"Correlation coefficient: {correlation:.2f}",
    xref='paper', yref='paper',
    x=0.05, y=0.9,
    showarrow=False,
    font=dict(size=12),
)
fig.show()
As with the other categories, we see a slight shift in the electricity data, but likewise no strong relationship is observed.